* Compile the list of grade 10 and grade 12 students
* from the baseline survey attendance sheets shared by the local partner.

* Global macro listing the school attendance workbooks (all .xlsx files in the folder)

* Clear the Results window, then collect the names of all .xlsx workbooks
* found in the attendance-sheet folder into the global macro filelist.
cls
local attendance_dir "$raw/manual/baseline_attendance_sheet_sample_school"
global filelist : dir "`attendance_dir'" files "*.xlsx"

* Note: minjiwoong central school.xlsx and damphu central school.xlsx
* contain no student IDs,
* so these two schools are to be matched on student name instead.

* Build one cleaned student list per school workbook that records student IDs.
* For every workbook: stack columns A-E of all sheets, derive studentid,
* stream2, grade and name2, remove duplicates, and save one sl_ file per
* school in the temp folder. The two workbooks without student IDs are
* skipped by the inlist condition below.
foreach excelfile of global filelist {
	if ~inlist("`excelfile'", "minjiwoong central school.xlsx", "damphu central school.xlsx") {
		clear
		local workbook "$raw/manual/baseline_attendance_sheet_sample_school/`excelfile'"
		* school name = file name without the trailing .xlsx
		local school_name=substr("`excelfile'", 1, length("`excelfile'") - 5)

		* Read the sheet names once and cache them in locals, so the sheet
		* loop does not depend on r() surviving the import and save commands
		import excel using "`workbook'", describe
		local n_sheets = r(N_worksheet)
		forvalues sheet=1/`n_sheets' {
			local sheetname_`sheet' "`r(worksheet_`sheet')'"
		}

		* Open each sheet of the excel file and save it as a temporary dataset.
		* The full file name with extension is used: import excel assumes .xls
		* when no extension is given.
		forvalues sheet=1/`n_sheets' {
			local sheetname "`sheetname_`sheet''"
			import excel using "`workbook'", sheet("`sheetname'") clear
			save "$temp/file_`school_name'_`sheetname'", replace
			clear
		}

		* keep necessary information (columns A-E when all of them exist;
		* otherwise the sheet is left unchanged)
		global tempfilelist: dir "$temp/" files "file_*.dta"
		foreach tfile of global tempfilelist {
			use "$temp/`tfile'", clear
			cap keep A-E
			save, replace
		}

		clear

		* stack all sheets of this school and remove the temporary files
		foreach tfile of global tempfilelist {
			append using "$temp/`tfile'", force
			erase "$temp/`tfile'"
		}

		* process data

		rename _all, lower
		keep b-e
		cap {
			rename b name
			rename c studentcode
			rename d class
			rename e section
		}

		* Only b-e survive the keep above, so studentcode is the only possible
		* source of the student ID. Header rows and remark rows are dropped.
		cap g studentid=""
		cap replace studentid = studentcode
		cap drop if studentid=="" | regexm(studentid, "Drop|drop|student|cla|left")
		cap drop if class=="" | regexm(class, "Drop|drop|Change|change")
		cap drop if name=="" | regexm(name, "Name|name|student|code")

		* revalue: stream2 is 1 for Arts, 2 for Commerce, 3 for Science,
		* read from the section text first and then from the class text
		g stream2 =.
		cap replace stream2=1 if regexm(section, "Art|art|ART")
		cap replace stream2=1 if regexm(class, "Art|art|ART")
		cap replace stream2=2 if regexm(section, "Com|com|COM")
		cap replace stream2=2 if regexm(class, "Com|com|COM")
		cap replace stream2=3 if regexm(section, "Sci|sci|SCI")
		cap replace stream2=3 if regexm(class, "Sci|sci|SCI")

		* The order of the grade rules matters: X also matches IX, XI and XII,
		* so the more specific patterns must come later and overwrite it.
		* The regexm lines apply when class is a string, the == lines when it
		* is numeric; the non-applicable form fails quietly under capture.
		g grade=.
		cap replace grade=10 if regexm(class,"X|x|10")
		cap replace grade=10 if class==10
		cap replace grade=9 if regexm(class,"IX|Ix|iX|ix|9")
		cap replace grade=9 if class==9
		cap replace grade=11 if regexm(class,"XI|Xi|xI|xi|11")
		cap replace grade=11 if class==11
		cap replace grade=12 if regexm(class,"XII|Xii|XIi|XiI|xIi|xiI|xii|12")
		cap replace grade=12 if class==12

		* lower-case copy of the student name
		g name2=""
		cap replace name2 = name
		replace name2=lower(name2)

		* bring studentid to a common format: strip dots, dashes and spaces,
		* then re-insert dots after the 3rd, 8th and 10th character
		replace studentid=subinstr(studentid,".","",.)
		replace studentid=subinstr(studentid,"-","",.)
		replace studentid=subinstr(studentid," ","",.)
		replace studentid=substr(studentid,1,3) + "." + substr(studentid,4,5) + "." + substr(studentid,9,2) + "." + substr(studentid,11,4)

		* check if duplicated observations of same student-studentid;
		* keep the first record of each studentid-name pair
		unique studentid
		bys studentid name2: g temp1=cond(_N==1, 0, _n)
		tab temp1
		list name2 studentid grade stream2 if temp1~=0 & temp1~=1
		keep if temp1==0|temp1==1

		* check if studentid is unique; IDs still shared by several names
		* are listed and then dropped entirely
		duplicates tag studentid, g(temp2)
		tab temp2
		list name2 studentid grade stream2 if temp2~=0
		keep if temp2==0

		* keep relevant variables
		keep name2 studentid stream2 grade

		* generate schoolname
		g schoolname2="`school_name'"

		* save as new dataset
		save "$temp/sl_`school_name'.dta", replace
	}
}


* Stack the per-school sl_ files into a single student list,
* deleting each per-school file once it has been appended
clear

global tempfilelist: dir "$temp/" files "sl_*.dta"
foreach school_dta of global tempfilelist {
	append using "$temp/`school_dta'"
	erase "$temp/`school_dta'"
}

* column order, display widths and sort order of the combined list
order name2 studentid grade stream2 schoolname2
format %30s name2
format %20s studentid
sort schoolname2 grade stream2 studentid

save "$temp/list_of_students.dta", replace

********************** minjiwoong central school.xlsx
* This workbook does not record studentid; the plan is to merge on name and grade.
* The block stacks columns A-E of every sheet, derives grade and name2,
* keeps one record per name-grade pair and saves the list in the temp folder.
local workbook "$raw/manual/baseline_attendance_sheet_sample_school/minjiwoong central school.xlsx"

* Read the sheet names once and cache them in locals, so the sheet loop
* does not depend on r() surviving the import and save commands
import excel using "`workbook'", describe
local n_sheets = r(N_worksheet)
forvalues sheet=1/`n_sheets' {
	local sheetname_`sheet' "`r(worksheet_`sheet')'"
}

* Open each sheet of the excel file and save it as a temporary dataset
forvalues sheet=1/`n_sheets' {
	local sheetname "`sheetname_`sheet''"
	import excel using "`workbook'", sheet("`sheetname'") clear
	save "$temp/file_minjiwoong_`sheetname'", replace
	clear
}

* keep necessary information (columns A-E when all of them exist;
* otherwise the sheet is left unchanged)
global tempfilelist: dir "$temp/" files "file_*.dta"
foreach tfile of global tempfilelist {
	use "$temp/`tfile'", clear
	cap keep A-E
	save, replace
}

clear

* stack all sheets and remove the temporary files
foreach tfile of global tempfilelist {
	append using "$temp/`tfile'", force
	erase "$temp/`tfile'"
}

rename _all, lower
keep b-e
cap {
	rename b name
	rename c studentcode
	rename d class
	rename e section
}

* studentid is created empty so the output has the same variables as the
* other school lists; header rows are dropped
cap g studentid=""
cap drop if name=="" | regexm(name, "Name|name|student|code")

* revalue
* NOTE(review): stream2 is created but never filled for this school - confirm this is intended
g stream2 =.

* The order of the grade rules matters: X also matches IX, XI and XII,
* so the more specific patterns must come later and overwrite it.
* The regexm lines apply when class is a string, the == lines when it
* is numeric; the non-applicable form fails quietly under capture.
g grade=.
cap replace grade=10 if regexm(class,"X|x|10")
cap replace grade=10 if class==10
cap replace grade=9 if regexm(class,"IX|Ix|iX|ix|9")
cap replace grade=9 if class==9
cap replace grade=11 if regexm(class,"XI|Xi|xI|xi|11")
cap replace grade=11 if class==11
cap replace grade=12 if regexm(class,"XII|Xii|XIi|XiI|xIi|xiI|xii|12")
cap replace grade=12 if class==12

* lower-case copy of the student name (only b-e survive the keep above,
* so name is the only possible source)
g name2=""
cap replace name2 = name
replace name2=lower(name2)

* check if duplicated observations of same student;
* keep the first record of each name-grade pair
unique name2 grade
bys name2 grade: g temp1=cond(_N==1, 0, _n)
tab temp1
list name2 grade if temp1~=0
keep if temp1==0|temp1==1

* keep relevant variables
keep name2 studentid stream2 grade

* generate schoolname
g schoolname2="minjiwoong central school"

* save as new dataset
save "$temp/list_minjiwoong central school.dta", replace
	

********************** damphu central school.xlsx
* This workbook does not record studentid; the plan is to merge on name and grade.
* The block stacks columns A-E of every sheet, derives stream2, grade and name2,
* keeps one record per name-grade pair and saves the list in the temp folder.
local workbook "$raw/manual/baseline_attendance_sheet_sample_school/damphu central school.xlsx"

* Read the sheet names once and cache them in locals, so the sheet loop
* does not depend on r() surviving the import and save commands
import excel using "`workbook'", describe
local n_sheets = r(N_worksheet)
forvalues sheet=1/`n_sheets' {
	local sheetname_`sheet' "`r(worksheet_`sheet')'"
}

* Open each sheet of the excel file and save it as a temporary dataset
forvalues sheet=1/`n_sheets' {
	local sheetname "`sheetname_`sheet''"
	import excel using "`workbook'", sheet("`sheetname'") clear
	save "$temp/file_damphu_`sheetname'", replace
	clear
}

* keep necessary information (columns A-E when all of them exist;
* otherwise the sheet is left unchanged)
global tempfilelist: dir "$temp/" files "file_*.dta"
foreach tfile of global tempfilelist {
	use "$temp/`tfile'", clear
	cap keep A-E
	save, replace
}

clear

* stack all sheets and remove the temporary files
foreach tfile of global tempfilelist {
	append using "$temp/`tfile'", force
	erase "$temp/`tfile'"
}

rename _all, lower
keep b-e
cap {
	rename b name
	rename c studentcode
	rename d class
	rename e section
}

* studentid is created empty so the output has the same variables as the
* other school lists; header rows are dropped
cap g studentid=""
cap drop if name=="" | regexm(name, "Name|name|student|code")

* revalue: stream2 is 1 for Arts, 2 for Commerce, 3 for Science,
* read from the section text first and then from the class text
g stream2 =.
cap replace stream2=1 if regexm(section, "Art|art|ART")
cap replace stream2=1 if regexm(class, "Art|art|ART")
cap replace stream2=2 if regexm(section, "Com|com|COM")
cap replace stream2=2 if regexm(class, "Com|com|COM")
cap replace stream2=3 if regexm(section, "Sci|sci|SCI")
cap replace stream2=3 if regexm(class, "Sci|sci|SCI")

* The order of the grade rules matters: X also matches IX, XI and XII,
* so the more specific patterns must come later and overwrite it.
* The regexm lines apply when class is a string, the == lines when it
* is numeric; the non-applicable form fails quietly under capture.
g grade=.
cap replace grade=10 if regexm(class,"X|x|10")
cap replace grade=10 if class==10
cap replace grade=9 if regexm(class,"IX|Ix|iX|ix|9")
cap replace grade=9 if class==9
cap replace grade=11 if regexm(class,"XI|Xi|xI|xi|11")
cap replace grade=11 if class==11
cap replace grade=12 if regexm(class,"XII|Xii|XIi|XiI|xIi|xiI|xii|12")
cap replace grade=12 if class==12

* lower-case copy of the student name (only b-e survive the keep above,
* so name is the only possible source)
g name2=""
cap replace name2 = name
replace name2=lower(name2)

* check if duplicated observations of same student;
* keep the first record of each name-grade pair
unique name2 grade
bys name2 grade: g temp1=cond(_N==1, 0, _n)
tab temp1
list name2 grade if temp1~=0
keep if temp1==0|temp1==1

* keep relevant variables
keep name2 studentid stream2 grade

* generate schoolname
g schoolname2="damphu central school"

* save as new dataset
save "$temp/list_damphu central school.dta", replace



